Series 7 – Fitting a Logistic Regression in Bayes

Biostatistics Tutorial R Bayesian Methods JAGS/Stan

How to fit a Logistic Regression using Bayesian Methods
The advantage of using Bayes to overcome separation in logistic regression

Hai Nguyen
March 20, 2022

Logistic regression

Prediction of gender by height and weight. Probability close to 0.5 (balanced data)

mydf <- read.csv(file = 'data/HtWtData300.csv')
head(mydf)
  male height weight
1    0   64.0  136.4
2    0   62.3  215.1
3    1   67.9  173.6
4    0   64.2  117.3
5    0   64.8  123.3
6    0   57.5   96.5

Probability close to 0.5

table(mydf$male)

  0   1 
152 148 
prop.table(table(mydf$male))

        0         1 
0.5066667 0.4933333 

First, we try to fit the model with glm()

fmod1 <- glm(male ~ height + weight, 
             family = "binomial", data = mydf)
summary(fmod1)

Call:
glm(formula = male ~ height + weight, family = "binomial", data = mydf)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7751  -0.4674  -0.0165   0.3655   3.1906  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -59.994880   7.136907  -8.406  < 2e-16 ***
height        0.890858   0.108873   8.183 2.78e-16 ***
weight        0.005320   0.005225   1.018    0.309    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 415.83  on 299  degrees of freedom
Residual deviance: 189.49  on 297  degrees of freedom
AIC: 195.49

Number of Fisher Scoring iterations: 6

\(\Rightarrow\) We will see that these estimates are essentially the same as those from the Bayesian method.

plot(mydf$height, 
     mydf$weight,pch=16, col="blue")
points(mydf$height[mydf$male==1], 
       mydf$weight[mydf$male==1], 
       col="orange", pch=16)
legend("topleft", legend=c("Female","Male"), 
       col=c("blue","orange"), pch=16)

Model for Dichotomous y with multiple metric predictors set up in Stan

First, we look at a simplified Stan model string:

# Not eval
modelString <- "
  data{
    int<lower=0> N;
    vector[N] xc;              // continuous predictor
    vector[N] xb;              // binary (0/1) predictor
    int<lower=0,upper=1> y[N]; // binary outcome
  }
  parameters{
    real b0;
    real b1;
    real b2;
  }
  model{
    b0 ~ normal(0,1);
    b1 ~ normal(0,1);
    b2 ~ normal(0,1);
    y  ~ bernoulli_logit(b0 + b1*xc + b2*xb);
  }
"

Then we move on to the full model:

modelString <- "
  data {
    int<lower=1> Ntotal; // num of observations
    int<lower=1> Nx;     // num of predictors
    int<lower=0, upper=1> y[Ntotal];
    matrix[Ntotal, Nx] x;
  }
  
  transformed data {
    vector[Nx] meanX;
    vector[Nx] sdX;
    matrix[Ntotal, Nx] zx; // normalized

    for ( j in 1:Nx ) {
      meanX[j] = mean(x[,j]);
      sdX[j] = sd(x[,j]);
        for (i in 1:Ntotal) {
          zx[i,j] = (x[i,j] - meanX[j]) / sdX[j];
        }
      }
  }
    
  parameters {
    real zbeta0;
    vector[Nx] zbeta;
  }

  transformed parameters{
    vector[Ntotal] mu;
    mu = zbeta0 + zx * zbeta;  // matrix product
  }

  model {
    zbeta0 ~ normal(0, 2);
    zbeta  ~ normal(0, 2);
    y ~ bernoulli_logit(mu);
  }

  generated quantities { 
    // Transform to original scale:
    real beta0; 
    vector[Nx] beta;
    // .* and ./ are element-wise product and division
    beta0 = zbeta0 - sum(zbeta .* meanX ./ sdX);
    beta = zbeta ./ sdX;
  } 
"
library(rstan)
stanDsoLogistic <- stan_model(model_code=modelString)

Fit model

heightWeightDataList <- list(Ntotal = nrow(mydf),
                             y = mydf$male,
                             x = cbind(mydf$height, mydf$weight),
                             Nx = 2)
fit <- sampling(stanDsoLogistic,
                data = heightWeightDataList, 
                pars = c('beta0', 'beta'),
                iter = 5000, chains = 2, cores = 2
)

Analyze fitted model using shinystan

launch_shinystan(fit)

Analyze parameters.

stan_ac(fit, separate_chains = T)

pairs(fit)

plot(fit)

plot(fit,pars=c("beta"))

plot(fit,pars=c("beta[2]"))

summary(fit)$summary[,c(1,4,8)]
                 mean          2.5%        97.5%
beta0   -58.992924983 -7.323391e+01 -46.12530030
beta[1]   0.875179982  6.768256e-01   1.09355795
beta[2]   0.005549854 -4.832449e-03   0.01596438
lp__    -97.715598858 -1.009842e+02 -96.26790960

Parameter \(\beta_2\) is not significant: its 95% HDI contains zero.
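Since the interval above is computed from posterior draws, we can also inspect those draws directly. A minimal sketch, assuming fit is the stanfit object created above:

draws <- rstan::extract(fit)           # list with elements beta0, beta, lp__
beta2_draws <- draws$beta[, 2]         # posterior draws of the weight slope
quantile(beta2_draws, c(0.025, 0.975)) # should reproduce the interval above
mean(beta2_draws > 0)                  # posterior probability that beta_2 > 0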

stan_dens(fit)

estimBetas<-summary(fit)$summary[1:3,1]

Plot the data with the separating hyperplane.
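The line drawn by lines() below is the set of (height, weight) points at which the estimated linear predictor equals zero, i.e. the predicted probability of being male is 0.5:

\[\hat\beta_0 + \hat\beta_1\,\text{height} + \hat\beta_2\,\text{weight} = 0 \quad\Rightarrow\quad \text{weight} = -\frac{\hat\beta_0 + \hat\beta_1\,\text{height}}{\hat\beta_2}\]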

plot(mydf$height, mydf$weight, pch=16, col="blue")
points(mydf$height[mydf$male==1], mydf$weight[mydf$male==1], col = "orange", pch = 16)
lines(mydf$height, -(estimBetas[1]+estimBetas[2]*mydf$height)/estimBetas[3])
legend("topleft", legend = c("Female","Male"), col = c("blue","orange"), pch=16)

Prediction of gender by height and weight. Probability close to an extreme (imbalanced sample)

Now we remove almost all males from the sample to see what happens when only a few 1's are observed.

In the original sample the proportion of males is:

mean(mydf$male)
[1] 0.4933333

The subsample of females is:

females <- mydf[mydf$male == 0,]

Select the first 15 males.

males <- mydf[mydf$male == 1,][1:15,] # just 15 males (originally was ~150)
mydf_sparse <- rbind(males,females)
rownames(mydf_sparse) <- NULL
head(mydf_sparse, 20)
   male height weight
1     1   67.9  173.6
2     1   70.2  191.1
3     1   71.1  193.9
4     1   66.5  127.1
5     1   75.1  204.4
6     1   64.6  143.4
7     1   69.2  124.4
8     1   68.1  140.9
9     1   72.6  164.7
10    1   71.5  193.6
11    1   76.0  180.0
12    1   69.7  155.0
13    1   73.3  188.2
14    1   68.3  178.6
15    1   70.8  207.3
16    0   64.0  136.4
17    0   62.3  215.1
18    0   64.2  117.3
19    0   64.8  123.3
20    0   57.5   96.5

Fit sparse model

fmod2 <- glm(male ~ height + weight, family = "binomial", data = mydf_sparse)
summary(fmod2)

Call:
glm(formula = male ~ height + weight, family = "binomial", data = mydf_sparse)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.28353  -0.17117  -0.06829  -0.01696   3.15051  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -84.24302   20.03395  -4.205 2.61e-05 ***
height        1.25377    0.30728   4.080 4.50e-05 ***
weight       -0.01190    0.01248  -0.953     0.34    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 100.909  on 166  degrees of freedom
Residual deviance:  35.582  on 164  degrees of freedom
AIC: 41.582

Number of Fisher Scoring iterations: 8
heightWeightSparseDataList<-list(Ntotal=nrow(mydf_sparse),
                                 y=mydf_sparse$male,
                                 x=cbind(mydf_sparse$height, mydf_sparse$weight),
                                 Nx=2)
fit_sparse <- sampling(stanDsoLogistic,
                       data=heightWeightSparseDataList, 
                       pars=c('beta0', 'beta'),
                       iter=5000, chains = 2, cores = 2
)
stan_ac(fit_sparse, separate_chains = T)

pairs(fit_sparse)

plot(fit_sparse)

plot(fit_sparse,pars=c("beta"))

plot(fit_sparse,pars=c("beta[2]"))

Compare summaries of the fits to the two samples

rbind(beta0reg=summary(fit)$summary[1,c(1,3)],
      beta0glm=c(summary(fmod1)$coefficients[1,1],summary(fmod1)$coefficients[1,2]*sqrt(dim(mydf)[1])),
      beta0sparce=summary(fit_sparse)$summary[1,c(1,3)],
      beta0sparceglm=c(summary(fmod2)$coefficients[1,1],summary(fmod2)$coefficients[1,2]*sqrt(dim(mydf_sparse)[1])))
                    mean         sd
beta0reg       -58.99292   6.931705
beta0glm       -59.99488 123.614858
beta0sparce    -65.45501  12.684797
beta0sparceglm -84.24302 258.895713
rbind(beta1reg=summary(fit)$summary[2,c(1,3)],
      beta1glm=c(summary(fmod1)$coefficients[2,1],summary(fmod1)$coefficients[2,2]*sqrt(dim(mydf)[1])),
      beta1sparce=summary(fit_sparse)$summary[2,c(1,3)],
      beta1sparceglm=c(summary(fmod2)$coefficients[2,1],summary(fmod2)$coefficients[2,2]*sqrt(dim(mydf_sparse)[1])))
                    mean        sd
beta1reg       0.8751800 0.1059241
beta1glm       0.8908578 1.8857272
beta1sparce    0.9665638 0.1937020
beta1sparceglm 1.2537728 3.9708903
rbind(beta2reg=summary(fit)$summary[3,c(1,3)],
      beta2glm=c(summary(fmod1)$coefficients[3,1],summary(fmod1)$coefficients[3,2]*sqrt(dim(mydf)[1])),
      beta2sparce=summary(fit_sparse)$summary[3,c(1,3)],
      beta2sparceglm=c(summary(fmod2)$coefficients[3,1],summary(fmod2)$coefficients[3,2]*sqrt(dim(mydf_sparse)[1])))
                       mean          sd
beta2reg        0.005549854 0.005325699
beta2glm        0.005319759 0.090499569
beta2sparce    -0.007222053 0.010259493
beta2sparceglm -0.011900490 0.161304086
rbind(beta0reg=summary(fit)$summary[1,c(4,8)],
      beta0sparce=summary(fit_sparse)$summary[1,c(4,8)])
                 2.5%     97.5%
beta0reg    -73.23391 -46.12530
beta0sparce -92.80089 -42.89777
rbind(beta1reg=summary(fit)$summary[2,c(4,8)],
      beta1sparce=summary(fit_sparse)$summary[2,c(4,8)])
                 2.5%    97.5%
beta1reg    0.6768256 1.093558
beta1sparce 0.6177309 1.379462
rbind(beta2reg=summary(fit)$summary[3,c(4,8)],
      beta2sparce=summary(fit_sparse)$summary[3,c(4,8)])
                    2.5%      97.5%
beta2reg    -0.004832449 0.01596438
beta2sparce -0.028192807 0.01230010

The HDIs of both slopes widened significantly in the sample with the more extreme class imbalance.
The standard deviations of the betas also increase dramatically, especially for the glm fit.
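One way to quantify the widening is to compare interval widths directly; a quick sketch using the summary matrices already computed above, shown for the slope of height, beta[1]:

widthFull   <- diff(summary(fit)$summary["beta[1]", c("2.5%", "97.5%")])
widthSparse <- diff(summary(fit_sparse)$summary["beta[1]", c("2.5%", "97.5%")])
c(full = unname(widthFull), sparse = unname(widthSparse))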

Robust logistic regression

Prediction of gender by height and weight. Robust model

Look again at the data from the previous section.
Plot the male (1) and female (0) groups against weight.

plot(mydf$weight,mydf$male)

In the lower right corner there are some outliers representing heavy females.
Such observations bias the model parameters.

plot(mydf$height,mydf$male)

On the plot against height, the outliers are short males.

What does a model robust to such outliers look like?

Robust logistic regression is a mixture of a “guessing” model and a logistic model:

\[\mu = \alpha \frac{1}{2} + (1 - \alpha) \ \text{logistic}\Big(\beta_0 + \sum_j \beta_j x_j\Big)\]

Typically \(\alpha\) is small (only a few outliers for which the predictors point to the wrong class).
We select a beta distribution as the prior for \(\alpha\), with high concentration near zero: dbeta(1, 9).

Argument <- seq(from=0,to=1,by=.01)
plot(Argument, dbeta(Argument, 1, 9), type="l")
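Before writing the Stan version, here is a minimal R sketch of the mixture itself (the function name p_robust is only for illustration): with probability \(\alpha\) the response is a pure guess (probability 1/2), otherwise it follows the usual logistic curve.

p_robust <- function(x, alpha, beta0, beta1) {
  # guessing component + logistic component
  alpha / 2 + (1 - alpha) * plogis(beta0 + beta1 * x)
}
# even for extreme x the probability stays within [alpha/2, 1 - alpha/2]
p_robust(c(-100, 0, 100), alpha = 0.1, beta0 = 0, beta1 = 1)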

The modified model in Stan is below.

modelString= 
"data {                   // ROBUST LOGISTIC REGRESSION
    int<lower=1> Ntotal;  // num of observations
    int<lower=1> Nx;      // num of predictors
    int<lower=0, upper=1> y[Ntotal];
    matrix[Ntotal, Nx] x;
}
transformed data {
    vector[Nx] meanX;
    vector[Nx] sdX;
    matrix[Ntotal, Nx] zx;  // normalized
    
    for ( j in 1:Nx ) {
        meanX[j] = mean(x[,j]);
        sdX[j] = sd(x[,j]);
        for ( i in 1:Ntotal ) {
            zx[i,j] = ( x[i,j] - meanX[j] ) / sdX[j];
        }
    }
}
parameters {
    real zbeta0;
    vector[Nx] zbeta;
    real<lower=0,upper=1> guess;  // mixture param
}
transformed parameters{
    vector[Ntotal] mu;
    for ( i in 1:Ntotal ) {
        mu[i] = guess * (1/2.0) + (1-guess) * inv_logit(zbeta0 + zx[i,] * zbeta);
    }
}
model {
    zbeta0 ~ normal(0, 2);
    zbeta  ~ normal(0, 2);
    guess ~ beta(1, 9);
    y ~ bernoulli(mu);
}
generated quantities { 
    // Transform to original scale:
    real beta0; 
    vector[Nx] beta;
    // .* and ./ are element-wise product and division
    beta0 =  zbeta0 - sum( zbeta .* meanX ./ sdX );
    beta =  zbeta ./ sdX;
}
"
stanDsoRobustLogistic <- stan_model(model_code=modelString)

Run robust MCMC with the height/weight data.

fitRobust <- sampling(stanDsoRobustLogistic,
                data=heightWeightDataList, 
                pars=c('beta0', 'beta', 'guess'),
                iter=5000, chains = 2, cores = 2)

Analyze results.

stan_ac(fitRobust, separate_chains = T)

pairs(fitRobust)

plot(fitRobust)

plot(fitRobust,pars=c("beta[1]"))

plot(fitRobust,pars=c("beta[2]"))

plot(fitRobust,pars=c("guess"))

rbind(summary(fitRobust)$summary[,c(1,4,7)],
      summary(fit)$summary[,c(1,4,8)]
)
                 mean          2.5%           75%
beta0   -6.794151e+01 -9.040257e+01  -60.86879681
beta[1]  1.004011e+00  7.406505e-01    1.09734128
beta[2]  7.843206e-03 -4.273673e-03    0.01185354
guess    3.618337e-02  2.272841e-03    0.05068176
lp__    -1.019951e+02 -1.056117e+02 -100.89209843
beta0   -5.899292e+01 -7.323391e+01  -46.12530030
beta[1]  8.751800e-01  6.768256e-01    1.09355795
beta[2]  5.549854e-03 -4.832449e-03    0.01596438
lp__    -9.771560e+01 -1.009842e+02  -96.26790960

Note the positive correlation between guess and the slope \(\beta_1\): the larger the estimated proportion of outliers, the steeper the slope.
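The correlation visible in pairs(fitRobust) can also be checked numerically from the draws; a small sketch, assuming fitRobust is the stanfit object above:

drawsRobust <- as.data.frame(fitRobust)          # one column per parameter
cor(drawsRobust$guess, drawsRobust[["beta[1]"]]) # correlation of posterior draws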

Since \(\beta_2\) does not seem significant, we now fit both models with height as the only predictor.

heightWeightDataList<-list(Ntotal=nrow(mydf),
                          y=mydf$male,
                          x=cbind(mydf$height),
                          Nx=1)


fit <- sampling(stanDsoLogistic,
                data=heightWeightDataList, 
                pars=c('beta0', 'beta'),
                iter=5000, chains = 2, cores = 2
)
fitRobust <- sampling(stanDsoRobustLogistic,
                data=heightWeightDataList, 
                pars=c('beta0', 'beta', 'guess'),
                iter=5000, chains = 2, cores = 2
)
pairs(fit)

pairs(fitRobust)

plot(fit)

plot(fit,pars=c("beta"))

plot(fitRobust)

plot(fitRobust,pars=c("beta[1]","guess"))

plot(fitRobust,pars=c("guess"))

rbind(summary(fitRobust)$summary[,c(1,4,7)],
      summary(fit)$summary[,c(1,4,8)]
)
                 mean          2.5%           75%
beta0    -66.88526189 -8.866638e+01  -60.00415120
beta[1]    1.00688726  7.605573e-01    1.09231329
guess      0.03243383  1.645023e-03    0.04555081
lp__    -102.24849261 -1.055917e+02 -101.29745249
beta0    -59.40455205 -7.350401e+01  -46.79642504
beta[1]    0.89464752  7.033695e-01    1.10703605
lp__     -97.75056275 -1.006445e+02  -96.76446560

Compare probabilities predicted by logistic regression and robust logistic regression.

# Coefficients
meanBeta0Robust<-summary(fitRobust)$summary[1,1]
meanBeta1Robust<-summary(fitRobust)$summary[2,1]
guess<-summary(fitRobust)$summary[3,1]
meanBeta0<-summary(fit)$summary[1,1]
meanBeta1<-summary(fit)$summary[2,1]

#Linear predictors and probabilities
linPredRobust_Male.Height<-meanBeta0Robust+meanBeta1Robust*mydf$height
pRobustMail_height<-guess/2+(1-guess)*exp(linPredRobust_Male.Height)/(1+exp(linPredRobust_Male.Height))
linPred_Male.Height<-meanBeta0+meanBeta1*mydf$height
pMail_height<-exp(linPred_Male.Height)/(1+exp(linPred_Male.Height))

# Plot
plot(mydf$height,mydf$male,pch=16)
points(mydf$height,pRobustMail_height,col="orange",pch=16)
points(mydf$height,pMail_height,col="cyan",pch=16)
legend("topleft",
       legend=c("Actual","Prob Logistic","Prob. Robust"),
       col=c("black","cyan","orange"), pch=16)

The logistic probability is more extreme: it is higher in the area of low-height male observations because it is biased by an outlier, a short male below the average female height, while its far tails are closer to 0.
Robust logistic regression did not react to this observation as much.
The robust model does not produce probabilities too close to 0 or 1.
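By construction, the robust probabilities lie inside \([\alpha/2,\ 1-\alpha/2]\); a quick check with the vectors computed above:

range(pMail_height)         # plain logistic probabilities
range(pRobustMail_height)   # robust probabilities, bounded away from 0 and 1
c(guess / 2, 1 - guess / 2) # the theoretical bounds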

Prediction of gender by height and weight with sparse data. Robust model

Repeat the same comparison with the sparse data.

Create data list with one predictor.

heightWeightSparseDataList <- list(Ntotal = nrow(mydf_sparse),
                                   y = mydf_sparse$male,
                                   x = cbind(mydf_sparse$height),
                                   Nx = 1)

Fit both models with height as the only predictor

fitSparse <- sampling(stanDsoLogistic,
                       data=heightWeightSparseDataList, 
                       pars=c('beta0', 'beta'),
                       iter=5000, chains = 2, cores = 2)

fitSparseRobust <- sampling(stanDsoRobustLogistic,
                data=heightWeightSparseDataList, 
                pars=c('beta0', 'beta', 'guess'),
                iter=5000, chains = 2, cores = 2)

Analyze the models.

pairs(fitSparse)

pairs(fitSparseRobust)

plot(fitSparse)

plot(fitSparse,pars=c("beta"))

plot(fitSparseRobust)

plot(fitSparseRobust,pars=c("beta[1]","guess"))

plot(fitSparseRobust,pars=c("guess"))

rbind(summary(fitSparseRobust)$summary[,c(1,4,7)],
      summary(fitSparse)$summary[,c(1,4,8)]
)
                mean          2.5%          75%
beta0   -65.21124168 -92.353657075 -56.21273405
beta[1]   0.94557811   0.613894602   1.06360012
guess     0.01666415   0.000493968   0.02404738
lp__    -28.82697378 -32.226208453 -27.88666403
beta0   -62.74745106 -90.025601491 -40.85149755
beta[1]   0.90990203   0.584875829   1.31576470
lp__    -23.23831969 -26.128614012 -22.20502250

Plot the probabilities.

# Coefficients
meanBeta0Robust<-summary(fitSparseRobust)$summary[1,1]
meanBeta1Robust<-summary(fitSparseRobust)$summary[2,1]
guess<-summary(fitSparseRobust)$summary[3,1]
meanBeta0<-summary(fitSparse)$summary[1,1]
meanBeta1<-summary(fitSparse)$summary[2,1]

#Linear predictors and probabilities
linPredRobust_Male.Height<-meanBeta0Robust+meanBeta1Robust*mydf_sparse$height
pRobustMail_height<-guess/2+(1-guess)*exp(linPredRobust_Male.Height)/(1+exp(linPredRobust_Male.Height))
linPred_Male.Height<-meanBeta0+meanBeta1*mydf_sparse$height
pMail_height<-exp(linPred_Male.Height)/(1+exp(linPred_Male.Height))

# Plot
plot(mydf_sparse$height,mydf_sparse$male,pch=16)
points(mydf_sparse$height,pRobustMail_height,col="orange",pch=16)
points(mydf_sparse$height,pMail_height,col="cyan",pch=16)
legend("topleft",
       legend=c("Actual","Prob Logistic","Prob. Robust"),
       col=c("black","cyan","orange"),pch=16)

Anesthesia example

This example uses data from the DAAG package.
Thirty patients were given an anesthetic agent maintained at a predetermined concentration level (conc) for 15 minutes before an incision was made. It was then noted whether the patient moved (move = 1), i.e. jerked or twisted.

Data

library(DAAG)
head(anesthetic)
  move conc   logconc nomove
1    0  1.0 0.0000000      1
2    1  1.2 0.1823216      0
3    0  1.4 0.3364722      1
4    1  1.4 0.3364722      0
5    1  1.2 0.1823216      0
6    0  2.5 0.9162907      1

Use column move as response and column logconc as predictor.

Prepare the data.

dataListAnesthetic <- list(Ntotal=nrow(anesthetic),
                           y=anesthetic$move,
                           x=cbind(anesthetic$logconc),
                           Nx=1)

Logistic model by glm()

logRegr <- glm(move ~ logconc, 
               data=anesthetic,
               family="binomial")
summary(logRegr)

Call:
glm(formula = move ~ logconc, family = "binomial", data = anesthetic)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1477  -0.6962  -0.1209   0.7586   1.7528  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)   0.8077     0.5709   1.415  0.15715   
logconc      -6.2457     2.2618  -2.761  0.00575 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 41.455  on 29  degrees of freedom
Residual deviance: 28.007  on 28  degrees of freedom
AIC: 32.007

Number of Fisher Scoring iterations: 5
predLogRegr <- predict(logRegr, type="response")

Running chains

Run MCMC using logistic and robust logistic models.

fitAnesth <- sampling(stanDsoLogistic,
                data=dataListAnesthetic, 
                pars=c('beta0', 'beta'),
                iter=5000, chains = 2, cores = 2
)

fitRobustAnesth <- sampling(stanDsoRobustLogistic,
                             data=dataListAnesthetic, 
                              pars=c('beta0', 'beta', 'guess'),
                              iter=5000, chains = 2, cores = 2)

Analysis of logistic model

Extract diagnostics

summary(fitAnesth)$summary[,c(1,4,8:10)]
               mean        2.5%      97.5%    n_eff      Rhat
beta0     0.8195786  -0.1744736   1.944504 3161.224 1.0004414
beta[1]  -6.1980944 -10.7384617  -2.545248 2779.034 0.9997342
lp__    -15.4770746 -18.1920124 -14.472321 2004.009 1.0012005
stan_ac(fitAnesth, separate_chains = T)

stan_trace(fitAnesth)

pairs(fitAnesth,pars=c("beta0","beta"))

plot(fitAnesth)

Analysis of robust logistic model

summary(fitRobustAnesth)$summary[,c(1,4,8:10)]
                mean          2.5%       97.5%    n_eff      Rhat
beta0     1.03390884  -0.204622023   2.7745487 2884.167 1.0002948
beta[1]  -7.38947230 -14.369144436  -2.7431485 2649.559 1.0022518
guess     0.09805441   0.003237798   0.2942556 3631.455 0.9997073
lp__    -19.53242382 -22.933897477 -18.0116805 1675.079 1.0019349
stan_ac(fitRobustAnesth, separate_chains = T)

stan_trace(fitRobustAnesth)

pairs(fitRobustAnesth,pars=c("beta0","beta","guess"))

plot(fitRobustAnesth)

plot(fitRobustAnesth,pars=c("guess"))

The posterior mean of guess is almost \(10\%\), which suggests there should be a noticeable difference between the two models.
However, the lower limit of its 95% HDI is almost \(0\).
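A small sketch to look at the posterior draws of guess directly, assuming fitRobustAnesth is the stanfit above (the 5% threshold below is just an illustrative choice):

guessDraws <- rstan::extract(fitRobustAnesth)$guess
quantile(guessDraws, c(0.025, 0.5, 0.975)) # posterior median and 95% interval
mean(guessDraws > 0.05)                    # probability that guessing exceeds 5%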

Comparison of logistic and robust logistic models

Compare intercepts.

rbind(Logistic=summary(fitAnesth)$summary[1,c(1,4,8)],
      Robust=summary(fitRobustAnesth)$summary[1,c(1,4,8)])
              mean       2.5%    97.5%
Logistic 0.8195786 -0.1744736 1.944504
Robust   1.0339088 -0.2046220 2.774549

Compare slopes.

rbind(Logistic=summary(fitAnesth)$summary[2,c(1,4,8)],
      Robust=summary(fitRobustAnesth)$summary[2,c(1,4,8)])
              mean      2.5%     97.5%
Logistic -6.198094 -10.73846 -2.545248
Robust   -7.389472 -14.36914 -2.743148

Compare probabilities.

# Coefficients
meanBeta0Robust<-summary(fitRobustAnesth)$summary[1,1]
meanBeta1Robust<-summary(fitRobustAnesth)$summary[2,1]
guess<-summary(fitRobustAnesth)$summary[3,1]
meanBeta0<-summary(fitAnesth)$summary[1,1]
meanBeta1<-summary(fitAnesth)$summary[2,1]

#Linear predictors and probabilities
linPredRobust_Move<-meanBeta0Robust+meanBeta1Robust*anesthetic$logconc
pRobustMove<-guess/2+(1-guess)*exp(linPredRobust_Move)/(1+exp(linPredRobust_Move))
linPred_Move<-meanBeta0+meanBeta1*anesthetic$logconc
pMove<-exp(linPred_Move)/(1+exp(linPred_Move))

# Plot
plot(anesthetic$logconc,anesthetic$move,pch=16)
points(anesthetic$logconc,pRobustMove,col="orange",pch=15)
points(anesthetic$logconc,pMove,col="cyan",pch=17)
points(anesthetic$logconc,predLogRegr,col="purple",pch=25)
legend("topright",
       legend=c("Actual","Prob Logistic","Prob. Robust","Glm"),
       col=c("black","cyan","orange","purple"),pch=c(16,17,15,25))

Again, the robust method does not return extreme probabilities.

Softmax regression

Softmax (or multinomial logistic regression) is a generalization of logistic regression to the case where we want to handle multiple classes. In logistic regression the response was binary: \(y^{(i)} \in \{0,1\}\), whereas softmax handles \(y^{(i)} \in \{1, \dots, K\}\).

It is based on a modification of the odds from
\[\frac{p}{1-p}\] to \[\frac{p_s}{p_r},\]
where \(p_r\) is the probability of one of the classes, selected as the reference (for example, a control group).

The model changes very little from logistic regression.

Because adding the same constant to all coefficients does not change the softmax probabilities, the reference class coefficients are set to zero.
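A small R illustration of this invariance (the softmax helper below is written out only for the demonstration):

softmax <- function(lambda) exp(lambda) / sum(exp(lambda))
lambda <- c(0, 1.5, -0.7, 2.1)
softmax(lambda)
softmax(lambda + 10)  # shifting every lambda by the same constant leaves the probabilities unchanged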

Simulated data from the book

The data in SoftmaxRegData2.csv are from Kruschke (2015), chapter 22.

myData <- read.csv(file="data/SoftmaxRegData2.csv" )
head(myData)
           X1         X2 Y
1 -0.08714736 -1.0813422 2
2 -0.72256565 -1.5838631 2
3  0.17918961  0.9717904 3
4 -1.15975176  0.5026244 3
5 -0.72711762  1.3757045 3
6  0.53341559  1.7746506 3
table(myData$Y, useNA = "always")

   1    2    3    4 <NA> 
  58  145  139  133    0 
idx2<-myData$Y==2
idx3<-myData$Y==3
idx4<-myData$Y==4

plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2],myData$X2[idx2],pch=16,col="orange")
points(myData$X1[idx3],myData$X2[idx3],pch=16,col="cyan")
points(myData$X1[idx4],myData$X2[idx4],pch=16,col="magenta")

Prepare data list

dataListSoftmax <- list(N=nrow(myData),  # num of observations
                          K=max(myData$Y), # num of groups
                          y=myData$Y,
                          x=cbind(x1 = myData$X1, x2 = myData$X2),
                          D=2)  # num of predictors

Softmax model

Describe the model.

modelString="
data {
    int<lower=2> K;  // num of groups
    int<lower=0> N;  // num of observations
    int<lower=1> D;  // num of predictors 
    int<lower=1,upper=K> y[N];
    matrix[N, D] x;
}
transformed data {
    row_vector[D] zeros;
    row_vector[D] x_m;  // x means
    row_vector[D] x_sd; // x standard deviations
    matrix[N, D] zx;    // normalized x
    zeros = rep_row_vector(0, D); // coefficients are zeros for the baseline class
    for (j in 1:D) {
        x_m[j] = mean(x[,j]);
        x_sd[j] = sd(x[,j]);
        zx[,j] = (x[,j] - x_m[j]) / x_sd[j];
    }
}
parameters {
    matrix[K-1,D] zbeta_raw;  // K-1 makes model identifiable
    vector[K-1] zbeta0_raw;
}
transformed parameters {
    vector[K] zbeta0;   // intercept coefficients
    matrix[K, D] zbeta; // predictor coeffs
    zbeta0 = append_row(0, zbeta0_raw);
    zbeta = append_row(zeros, zbeta_raw); // add zeros for coefficients of the baseclass
}
model {
    zbeta0_raw ~ normal(0, 5);
    for (k in 1:(K-1))
        zbeta_raw[k,] ~ normal(0, 5);
    for (n in 1:N)
        y[n] ~ categorical(softmax(zbeta0 + zbeta * to_vector(zx[n,]) ));
}
generated quantities {
    vector[K] beta0;
    matrix[K, D] beta;
    // transform zbetas to original betas:
    for (k in 1:K) {
        beta0[k] = zbeta0[k];
        for (j in 1:D) {
            beta0[k] = beta0[k] - zbeta[k,j] * x_m[j] / x_sd[j];
            beta[k,j] = zbeta[k,j] / x_sd[j];
        }
     }
}
"

Create DSO.

modelSoftmax <- stan_model(model_code=modelString)
fit <- sampling(modelSoftmax,
                data=dataListSoftmax,
                pars=c('beta0', 'beta'),
                iter=5000, chains = 2, cores = 2)

Analysis

We could analyze the fitted model using shinystan, but here we inspect the summary directly for demonstration.

summary(fit)$summary[,c(1,4,8:10)]
                  mean        2.5%        97.5%    n_eff      Rhat
beta0[1]     0.0000000    0.000000    0.0000000      NaN       NaN
beta0[2]    -3.8862330   -5.342788   -2.6322714 2248.385 1.0004666
beta0[3]    -3.4331047   -4.773270   -2.2796907 3296.605 0.9997387
beta0[4]    -3.8265114   -5.189532   -2.6072841 3164.843 0.9996566
beta[1,1]    0.0000000    0.000000    0.0000000      NaN       NaN
beta[1,2]    0.0000000    0.000000    0.0000000      NaN       NaN
beta[2,1]   -5.5316729   -7.270050   -4.0375865 2447.257 1.0004653
beta[2,2]   -5.3960430   -7.160777   -3.8669145 2404.709 1.0003155
beta[3,1]   -1.4401162   -2.480155   -0.4521930 3834.628 0.9997128
beta[3,2]    8.7333497    6.723432   11.0850555 2888.872 0.9999173
beta[4,1]    8.1774008    6.235139   10.3564345 3186.626 0.9996173
beta[4,2]   -0.7379993   -1.678778    0.2006212 3773.177 0.9999730
lp__      -123.4277181 -128.596525 -120.1976451 1642.521 1.0026270

The model has 4 classes and 2 predictors. The returned coefficients form the system of linear predictors \[\lambda_{i,1} = \beta_{0,1} + \beta_{1,1} x_{i,1} + \beta_{2,1} x_{i,2} \\ \lambda_{i,2} = \beta_{0,2} + \beta_{1,2} x_{i,1} + \beta_{2,2} x_{i,2} \\ \lambda_{i,3} = \beta_{0,3} + \beta_{1,3} x_{i,1} + \beta_{2,3} x_{i,2} \\ \lambda_{i,4} = \beta_{0,4} + \beta_{1,4} x_{i,1} + \beta_{2,4} x_{i,2}\]

In this system the first class is selected as the reference, so \(\beta_{0,1} = \beta_{1,1} = \beta_{2,1} = 0\).

stan_ac(fit, separate_chains = T)

stan_trace(fit)

pairs(fit,pars=c("beta0"))

pairs(fit,pars=c("beta"))

plot(fit)

Classification

To predict classes, use the formula for the probability of class \(k\): \[p_k = \frac{e^{\lambda_k}}{\sum_s e^{\lambda_s}}.\]
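The code below implements this formula directly with exp(); as a side note, for very large \(\lambda\) values a numerically safer variant subtracts the maximum before exponentiating (a sketch with an illustrative name):

softmaxStable <- function(lambda) {
  z <- exp(lambda - max(lambda))  # subtracting max(lambda) does not change the result
  z / sum(z)
}
softmaxStable(c(0, 1000, 999))    # plain exp() would overflow to Inf/NaN here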

Create matrix of coefficients.

SoftmaxCoeff<-summary(fit)$summary[1:12,c(1)]
SoftmaxCoeff<-cbind(SoftmaxCoeff[1:4],matrix(SoftmaxCoeff[-(1:4)],ncol=2,byrow=T))
rownames(SoftmaxCoeff)<-paste0("Class",1:4)
SoftmaxCoeff
            [,1]      [,2]       [,3]
Class1  0.000000  0.000000  0.0000000
Class2 -3.886233 -5.531673 -5.3960430
Class3 -3.433105 -1.440116  8.7333497
Class4 -3.826511  8.177401 -0.7379993

Create linear predictors.

head(myData)
           X1         X2 Y
1 -0.08714736 -1.0813422 2
2 -0.72256565 -1.5838631 2
3  0.17918961  0.9717904 3
4 -1.15975176  0.5026244 3
5 -0.72711762  1.3757045 3
6  0.53341559  1.7746506 3
linPredictors<-apply(SoftmaxCoeff[,-1],1,function(z) z%*%t(myData[,1:2]))
dim(linPredictors)
[1] 475   4
head(linPredictors)
     Class1     Class2     Class3      Class4
[1,]      0   6.317040  -9.318237  0.08539082
[2,]      0  12.543590 -12.791852 -4.73981916
[3,]      0  -6.235041   8.228932  0.74812465
[4,]      0   3.703185   6.059772 -9.85469145
[5,]      0  -3.401184  13.061642 -6.96120110
[6,]      0 -12.526772  14.730464  3.05226225
linPredictors<-t(apply(linPredictors,1,function(z) z+SoftmaxCoeff[,1]))
dim(linPredictors)
[1] 475   4
head(linPredictors)
     Class1      Class2     Class3      Class4
[1,]      0   2.4308066 -12.751342  -3.7411206
[2,]      0   8.6573571 -16.224956  -8.5663305
[3,]      0 -10.1212743   4.795827  -3.0783867
[4,]      0  -0.1830483   2.626667 -13.6812028
[5,]      0  -7.2874165   9.628537 -10.7877125
[6,]      0 -16.4130045  11.297359  -0.7742491

Check the calculation for the first row of the data and the second class.

row1<-myData[1,]
Class2<-SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*row1[1]+SoftmaxCoeff[2,3]*row1[2]
c(Class2,linPredictors[1,2])
$X1
[1] 2.430807

$Class2
[1] 2.430807

Create probabilities

softmaxProb<-exp(linPredictors)/apply(exp(linPredictors),1,sum)
apply(head(softmaxProb),1,sum)
[1] 1 1 1 1 1 1

Predict classes.

predClass<-apply(softmaxProb,1,which.max)
head(predClass)
[1] 2 2 3 3 3 3

Plot predicted classes and compare them with the data.

idx2Pred<-predClass==2
idx3Pred<-predClass==3
idx4Pred<-predClass==4

par(mfrow=c(1,2))
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2],myData$X2[idx2],pch=16,col="orange")
points(myData$X1[idx3],myData$X2[idx3],pch=16,col="cyan")
points(myData$X1[idx4],myData$X2[idx4],pch=16,col="magenta")

plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")

par(mfrow=c(1,1))

See how different classes are separated by hyperplanes.
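Each grey line added below is the set of points where the linear predictor of class \(k\) equals that of the reference class (which is zero), i.e. where the two classes are equally probable:

\[\beta_{0,k} + \beta_{1,k} x_1 + \beta_{2,k} x_2 = 0 \quad\Rightarrow\quad x_2 = -\frac{\beta_{0,k} + \beta_{1,k} x_1}{\beta_{2,k}}\]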

Add hyperplane between class 1 and class 2:

plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")
lines(myData$X1,-(SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*myData$X1)/SoftmaxCoeff[2,3],col="grey")

Add hyperplane between class 1 and class 3.

plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")
lines(myData$X1,-(SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*myData$X1)/SoftmaxCoeff[2,3],col="grey")
lines(myData$X1,-(SoftmaxCoeff[3,1]+SoftmaxCoeff[3,2]*myData$X1)/SoftmaxCoeff[3,3],col="grey")

Add hyperplane between class 1 and class 4.

plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")
lines(myData$X1,-(SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*myData$X1)/SoftmaxCoeff[2,3],col="grey")
lines(myData$X1,-(SoftmaxCoeff[3,1]+SoftmaxCoeff[3,2]*myData$X1)/SoftmaxCoeff[3,3],col="grey")
lines(myData$X1,-(SoftmaxCoeff[4,1]+SoftmaxCoeff[4,2]*myData$X1)/SoftmaxCoeff[4,3],col="grey")

Further reading

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. Source code is available at https://github.com/hai-mn/hai-mn.github.io, unless otherwise noted. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Nguyen (2022, March 20). HaiBiostat: Series 7 -- Fitting a Logistic Regression in Bayes. Retrieved from https://hai-mn.github.io/posts/2022-03-20-Bayesian methods - Series 7 of 10/

BibTeX citation

@misc{nguyen2022series,
  author = {Nguyen, Hai},
  title = {HaiBiostat: Series 7 -- Fitting a Logistic Regression in Bayes},
  url = {https://hai-mn.github.io/posts/2022-03-20-Bayesian methods - Series 7 of 10/},
  year = {2022}
}